**----------------------------------------------------------------------------**
**--Local Corruption & Household Business Tax Compliance
**--Duong Le, Eddy Malesky, Anh Pham
**--Journal of Economic Behavior and Organization (July 2020)
**----------------------------------------------------------------------------**
**--Do File: Constructing pre-clean pairwise VHBS dataset
**--5 steps: 

**Step 1: create "formal_firm_indicators.dta"
**Step 2: create id to identify observations, trim and rename household business data
**Step 3: merge firms from VHBS2017 into commune-pair directory
**Step 4: merge in other variables/covariates from various other data sets
**Step 5: constructing variables for analysis
**----------------------------------------------------------------------------**

clear all 
set matsize 11000 

**specify your main directory that stores all replication files
global main_directory = "insert your working directory here" 

//set data working directory
cd "$main_directory\data" 

**---------------------------------------------------**
** Step 1: create "formal_firm_indicators.dta"
**---------------------------------------------------**

use "sub_data/enterprise2014.dta", clear /*confidential data on Vietnamese enterprise in 2014, also obtained from the GSO*/
destring xa, replace
rename ld11 ld_end
rename ts12 ts_asset_end
rename kqkd1 kqkd_rev_bus
rename kqkd22 kqkd_prof_bftax

keep tinh huyen xa ld_end ts_asset_end kqkd_rev_bus kqkd_prof_bftax
label data "2014 enterprise data"
destring tinh huyen xa, replace

	egen nfirmf=total(1), by(tinh huyen xa)
    
	foreach var in ld_end ts_asset_end kqkd_rev_bus kqkd_prof_bftax{
	egen `var'me= median(`var'), by(tinh huyen xa) 
	egen `var'm= mean(`var'), by(tinh huyen xa) 
	}

keep if nfirmf<.& ld_endme<.& ld_endm<.& ts_asset_endme<.& ts_asset_endm<. ///
 & kqkd_rev_busme<.& kqkd_rev_busm<.&kqkd_prof_bftaxme<.&kqkd_prof_bftaxm<.
 
keep tinh huyen xa nfirmf ld_endme ld_endm ts_asset_endme ts_asset_endm kqkd_rev_busme kqkd_rev_busm kqkd_prof_bftaxme kqkd_prof_bftaxm
 
duplicates drop tinh huyen xa, force

save "sub_data/formal_firm_indicators.dta", replace
 


 
**----------------------------------------------------------------------------**
**Step 2: ** create id to identify observations, trim and rename household business data
**----------------------------------------------------------------------------**

**create id to identify observations
use "VHBS2017_original.dta", clear // This is the original confidential houshold businesses 2017 data provided by GSO

duplicates drop dchi tinh huyen xa chucs tencs nganh_kd, force

recast str80 tencs chucs dchi, force

g id_nganh=_n

preserve
sort dchi tinh huyen xa chucs
duplicates drop dchi tinh huyen xa chucs tencs, force
g id_firm= _n
keep dchi tinh huyen xa chucs tencs id_firm
save "VHBS2017idfirm".dta, replace
restore

merge m:1 dchi tinh huyen xa chucs tencs using "VHBS2017idfirm.dta"
drop _merge

label var id_firm "uniquely identify firm by address,province,district,commune,owner"
label var id_nganh "uniquely identify firm by address,province,district,commune,owner,industry"

save "VHBS_2017.dta", replace


**----------------------------------------------------------------------------**
** trim and rename household business data
**----------------------------------------------------------------------------**

use "VHBS_2017.dta", clear
** label variables
gen register = .
replace register=1 if dkkd==1|dkkd==3
replace register=0 if dkkd==2|dkkd==4

gen taxid=1 if co_msthue==1
replace taxid=0 if co_msthue==2	

gen sex = gioitinh
g male=1 if sex==1
replace male=0 if sex==2
gen age = 2017 - namsinh
gen educ=tdcm
gen operation_hr=cond(giobd<giokt,giokt-giobd,24-giobd+giokt)
gen operation_month=operation_hr*ngay_bq
gen operation_month_ln = ln(operation_month)
gen location = diadiem
replace educ=0 if educ==9
gen worker_total=ld1
gen ind5=nganh_kd
tostring ind5, gen(ind5_str)

g ind2= substr(ind5_str, 1,1) if nganh_kd<=9999
replace ind2=substr(ind5_str,1,2) if nganh_kd>=10000& nganh_kd<.
destring ind2, replace

g ind3= substr(ind5_str, 1,2) if nganh_kd<=9999
replace ind3=substr(ind5_str,1,3) if nganh_kd>=10000& nganh_kd<.
destring ind3, replace

g ind4= substr(ind5_str, 1,3) if nganh_kd<=9999
replace ind4=substr(ind5_str,1,4) if nganh_kd>=10000& nganh_kd<.
destring ind4, replace
	
g lnworker_total= ln(worker_total)

g nganh07_2dig= ind2
g isic="agri" if nganh07_2dig>=1& nganh07_2dig<=3
replace isic="mining" if nganh07_2dig>=5& nganh07_2dig<=9
replace isic="manufacturing" if nganh07_2dig>=10& nganh07_2dig<=33
replace isic="electric, gas, water supply" if nganh07_2dig>=35& nganh07_2dig<=39
replace isic="construction" if nganh07_2dig>=41& nganh07_2dig<=43
replace isic="commerce" if nganh07_2dig>=45& nganh07_2dig<=47
replace isic="transporation, storage" if nganh07_2dig>=49& nganh07_2dig<=53
replace isic="hotel" if nganh07_2dig==55
replace isic="restaurant" if nganh07_2dig==56
replace isic="communication" if nganh07_2dig>=58& nganh07_2dig<=63
replace isic="finanical" if nganh07_2dig>=64& nganh07_2dig<=75
replace isic="real estate" if nganh07_2dig==68
replace isic="science" if nganh07_2dig>=69& nganh07_2dig<=75  
replace isic="public admin, defence" if nganh07_2dig>=77& nganh07_2dig<=84
replace isic="education" if nganh07_2dig==85
replace isic="health, social work" if nganh07_2dig>=86& nganh07_2dig<=88
replace isic="entertainment" if nganh07_2dig>=90& nganh07_2dig<=93
replace isic="other service" if nganh07_2dig>=94& nganh07_2dig<=96
replace isic="household" if nganh07_2dig>=97& nganh07_2dig<=98
replace isic="international" if nganh07_2dig==99  

g regis_taxid=0 if register==0
replace regis_taxid=1 if register==1& taxid==0
replace regis_taxid=2 if register==1& taxid==1

g rest=1 if isic=="restaurant"
replace rest=0 if isic!="restaurant"
g retail=1 if ind2==47
replace retail=0 if ind2!=47& ind2<.
g easy_evade=1 if isic=="restaurant"| ind2==47
replace easy_evade=0 if isic=="manufacturing"|ind2==46
g manu=1 if isic=="manufacturing"
replace manu=0 if isic!="manufacturing"
g wholesale=1 if ind2==46
replace wholesale=0 if ind2!=46& ind2<.
label var manu "manufacturing"

g lnthudn1= ln(thudn1)

g house=1 if location==1
replace house=0 if location>1& location<.
g operation_hr_6month= operation_hr*ngay_bq*thang_dn


g thue11_r= thue11/thudn1


g nopthue=1 if thue11>0& thue11<.
replace nopthue=0 if thue11==0

gen taxid_surveySample = taxid if phieu!=""

g lnasset1= ln(ts1+1)

g notraining=1 if tdcm==1
replace notraining=0 if tdcm>1& tdcm<.	

g rev_worker= thudn1/worker_total
g rev_worker_hr= thudn1/(worker_total* operation_hr_6month)
g thue11_worker=thue11/worker_total
g thue11_worker_hr= thue11/(worker_total*operation_hr_6month)
g lnrev_worker= ln(rev_worker)

** additional measurse for tax compliance of evasion
g drev1_2_r= (thudn2-thudn1)/thudn1
g thue11_r2= thue11/thudn2


label var thudn1 "revenue"
label var worker_total "#wors"
label var age "age of owner"
label var male "=1 if owner is male"
label var notraining "=1 if owner has no training"
label var manu "=1 if manufacturing"
label var wholesale "=1 if wholesale"
label var rest "=1 if restaurant"
label var retail "=1 if retail"
label var house "=1 if operate in owner's house"

label var taxid "=1 if have a tax ID"

label var operation_hr "daily hours"

keep tinh huyen xa capso macs nganh_kd id_nganh id_firm taxid register sex male age educ operation_hr operation_month operation_month_ln location worker_total ind2 ind3 ind4 ind5 lnworker_total nganh07_2dig isic ///
rest retail easy_evade manu wholesale phieu taxid_surveySample thudn1 thudn2 lnthudn1 lnrev_worker thue11_worker  thue11_worker_hr operation_hr_6month thue11 thue11_r notraining house  drev1_2_r thue11_r2 ts1

**merge with admin directory indicators 
merge m:1 xa using "sub_data/admin3_VES_GIS_directory.dta"
keep if _merge==1|_merge==3
drop _merge 
save "VHBS2017.dta", replace

**save ID-only data set into sub_data folder
preserve
keep tinh huyen xa id_nganh

save "sub_data/VHBS2017_IDonly.dta", replace
restore

**----------------------------------------------------------------------------**
**Step 3: merge firms from VHBS 2017 into commune-pair directory
**----------------------------------------------------------------------------**
cd "$main_directory\data\sub_data" 

**merging together the "skeleton" datasets (i.e., with just commune IDs & firm IDs)
use "pairwise_full_GIS_onlyID.dta", replace

preserve
keep if com_pairID==1
merge 1:m xa using "VHBS2017_IDonly.dta", gen(merge) 
save "firm_commune_pair_IDonly.dta", replace
restore

foreach x of numlist 2/1000 {
	display `x'

	preserve
	keep if com_pairID==`x'
	
	merge 1:m xa using VHBS2017_IDonly.dta, gen(merge) 

	keep if merge==3
	drop merge

	append using firm_commune_pair_IDonly.dta, force
	save firm_commune_pair_IDonly.dta, replace
	restore
	}

	
foreach x of numlist 1001/2000 {
	display `x'

	preserve
	keep if com_pairID==`x'
	
	merge 1:m xa using VHBS2017_IDonly.dta, gen(merge) 

	keep if merge==3
	drop merge

	append using firm_commune_pair_IDonly.dta, force
	save firm_commune_pair_IDonly.dta, replace
	restore
	}
	
	
foreach x of numlist 2001/2672 {
	display `x'

	preserve
	keep if com_pairID==`x'

	merge 1:m xa using VHBS2017_IDonly.dta, gen(merge) 

	keep if merge==3
	drop merge
	*order com_pairID tinh huyen xa id_nganh id_firm
	*sort tinh huyen xa

	append using firm_commune_pair_IDonly.dta, force
	save firm_commune_pair_IDonly.dta, replace
	restore
	}

**--------------------------------------------------------------------------**
*--merge other variables into the ID skeleton--**
cd "$main_directory\data" 
use "sub_data\firm_commune_pair_IDonly.dta", clear

drop merge
*merge the rest of pairwise_full_GIS_onlyID.dta into this master dataset
sort com_pairID tinh huyen xa
merge m:1 com_pairID tinh huyen xa using "sub_data/pairwise_full_GIS_allObs.dta", ///
keepusing(provinceID_2017 districtID_2017 communeID_2017 source_provinceID_GIS source_districtID_GIS source_communeID_GIS lat lon area_km2 boundary_km borderID ///
pci_rank_1 pci_1 pciECindex_1 pciLAindex_1 pciTindex_1 pciTCindex_1 pciICindex_1 pciBindex_1 pciPindex_1 pciBSSidex_1 pciLTindex_1 pciLIindex_1 ///
pci_id arc_id fid_river river near_x near_y saigon bienhoa mytho dongxoai thudaumot tayninh tanan communeid name_2 name_3 name_4 elev slope precip tem suit_tea suit_rice suit_coconut suit_coffee suit_sugar x y y2 x2 xy Bdr_comm Bdr_prov Bdr_dis dist1698 seg10 seg25 seg50 seg75 seg100 treat1698 fid_prov hcmp cap hcmc) gen(merge)

keep if merge==3
drop merge

order com_pairID tinh huyen xa borderID  
sort com_pairID tinh huyen xa
save "firm_commune_pair.dta", replace

*merge firm_2017 variables by id_nganh (unique variable)
merge m:1 id_nganh using "VHBS2017_test.dta", gen(merge)
keep if merge==3 
drop merge
save "firm_commune_pair.dta", replace

**----------------------------------------------------------------------------**
**Step 4: merging in other variables/covariates from various data sets
**----------------------------------------------------------------------------**
cd "$main_directory\data" 
use "firm_commune_pair.dta", clear

drop   provinceN_2017  districtN_2017  communeN_2017

** unit of thue11, thue12, and thue13 is in thousand dong.
label var thue11_r "total tax must pay in the first 6 months"


replace tinh=provinceID_2017 if tinh==.
replace huyen=districtID_2017 if huyen==.

** indicators for economic zone
merge m:1 communeID_2017 using "sub_data/econ_zone.dta"
keep if _merge==3
drop _merge
	
** indicators for formal firms (commune level)
merge m:1 tinh huyen xa using "sub_data/formal_firm_indicators.dta"
gen econ_formal=1 if _merge==3
label var econ_formal "commune with no missing characteristics of formal firms"
keep if _merge==1|_merge==3
drop _merge


** indicators for land characteristics(commune level)
merge m:1 provinceID_2017 districtID_2017 communeID_2017 using "sub_data/land_character.dta", gen(merge)
keep if merge==1|merge==3 
drop merge

** nightlight 2016
merge m:1 provinceID_GIS districtID_GIS communeID_GIS using "sub_data/light2016.dta"
keep if _merge==3
drop _merge


**----------------------------------------------------------**
**-----------Step 5: constructing variables for analysis
**----------------------------------------------------------**

* rescale corruption variable so that higher number means more corruption. 
replace pciICindex_1= 11-pciICindex_1
label var pciICindex_1 "corruption"

** fixed typo
rename pciBSSidex_1 pciBSSindex_1

**creating tax compliance ratio & winsorize variables 
winsor thue11_r, g(taxComp) p(0.01)

g taxComp_imp= taxComp
replace taxComp_imp=0 if taxComp_imp==.& phieu!=""

winsor thue11_r, g(thue11_r99) p(0.01)

foreach var in thue11  lnthudn1 worker_total {
	winsor `var', g(`var'_99) p(0.01)
}

g thue11_r99i=thue11_r99
replace thue11_r99i=0 if thue11_r99==.& phieu!=""

g thue11_99i=thue11_99
replace thue11_99i=0 if thue11_99==.& phieu!=""

foreach var in  lnrev_worker  thue11_worker thue11_worker_hr{
	winsor `var', g(`var'_99) p(0.01)
}

g thue11_worker_99i= thue11_worker_99 if phieu!=""
replace thue11_worker_99i=0 if thue11_worker_99==.& phieu!=""

g thue11_worker_hr_99i= thue11_worker_hr_99 if phieu!=""
replace thue11_worker_hr_99i=0 if thue11_worker_hr_99==.& phieu!=""

foreach var in drev1_2_r thue11_r2 ts1 {
winsor `var', g(`var'_99) p(0.01)
}

g thue11_r2_99i= thue11_r2_99
replace thue11_r2_99i=0 if thue11_r2_99==.& phieu!=""

gen corrupt_worker99 = pciICindex_1*worker_total_99
gen corrupt_operation_hr = pciICindex_1*operation_hr

label var lnthudn1_99 "log revenue"
label var worker_total_99 "\# workers"
	
label var taxComp "tax compliance ratio"
label var taxComp_imp "tax compliance ratio impute 0"	

save "firm_commune_pair.dta", replace






